#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A))
! (C) Copyright 1995- ECMWF.
! (C) Copyright 1995- Meteo-France.
! (C) Copyright 2022- NVIDIA.
!
! This software is licensed under the terms of the Apache Licence Version 2.0
! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
! In applying this licence, ECMWF does not waive the privileges and immunities
! granted to it by virtue of its status as an intergovernmental organisation
! nor does it submit to any jurisdiction.
!

MODULE TRGTOL_MOD
  USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE
  IMPLICIT NONE

  PRIVATE
  PUBLIC :: TRGTOL_HANDLE, TRGTOL, PREPARE_TRGTOL

  TYPE TRGTOL_HANDLE
    TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFS, HCOMBUFR_AND_REEL
  END TYPE
CONTAINS
  FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL)
    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRB, JPRBT
    USE TPM_DISTR,              ONLY: D
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
    USE ISO_C_BINDING,          ONLY: C_SIZE_T

    IMPLICIT NONE

    TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
    INTEGER(KIND=JPIM), INTENT(IN) :: KF_GP, KF_FS
    TYPE(TRGTOL_HANDLE) :: HTRGTOL

    REAL(KIND=JPRBT) :: DUMMY

    INTEGER(KIND=C_SIZE_T) :: NELEM

    HTRGTOL%HCOMBUFS = RESERVE(ALLOCATOR, int(KF_GP*D%NGPTOT*SIZEOF(DUMMY),kind=c_size_t))

    NELEM = KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! ZCOMBUFR
    NELEM = NELEM + KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! PREEL_REAL
    HTRGTOL%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM)
  END FUNCTION PREPARE_TRGTOL

  SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,&
     &PGP,PGPUV,PGP3A,PGP3B,PGP2,KPTRGP,KVSETUV,KVSETSC,KVSETSC3A,KVSETSC3B,KVSETSC2)

    !**** *TRGTOL * - transposition of grid point data from column
    !                 structure to latitudinal. Reorganize data between
    !                 grid point calculations and direct Fourier Transform

    ! Version using CUDA-aware MPI

    !     Purpose.
    !     --------


    !**   Interface.
    !     ----------
    !        *call* *trgtol(...)

    !        Explicit arguments :
    !        --------------------
    !           PREEL_REAL    -  Latitudinal data ready for direct FFT (output)
    !           PGP    -  Blocked grid point data    (input)

    !        Implicit arguments :
    !        --------------------

    !     Method.
    !     -------
    !        See documentation

    !     Externals.
    !     ----------

    !     Reference.
    !     ----------
    !        ECMWF Research Department documentation of the IFS

    !     Author.
    !     -------
    !        MPP Group *ECMWF*

    !     Modifications.
    !     --------------
    !        Original: 95-10-01
    !        D.Dent  : 97-08-04   Reorganisation to allow
    !                             NPRTRV to differ from NPRGPEW
    !                : 98-06-17   add mailbox control logic (from TRLTOM)
    !        =99-03-29= Mats Hamrud and Deborah Salmond
    !                   JUMP in FFT's changed to 1
    !                   KINDEX introduced and ZCOMBUF not used for same PE
    !         01-11-23  Deborah Salmond and John Hague
    !                    LIMP_NOOLAP Option for non-overlapping message passing
    !                    and buffer packing
    !         01-12-18  Peter Towers
    !                   Improved vector performance of GTOL_PACK,GTOL_UNPACK
    !         03-04-02  G. Radnoti: call barrier always when nproc>1
    !         08-01-01  G.Mozdzynski: cleanup
    !         09-01-02  G.Mozdzynski: use non-blocking recv and send
    !     ------------------------------------------------------------------

    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRB, JPRBT
    USE YOMHOOK,                ONLY: LHOOK, DR_HOOK, JPHOOK
    USE MPL_MODULE,             ONLY: MPL_WAIT, MPL_BARRIER
    USE TPM_GEN,                ONLY: LSYNC_TRANS
    USE EQ_REGIONS_MOD,         ONLY: MY_REGION_EW, MY_REGION_NS
    USE TPM_DISTR,              ONLY: D, MYSETV, MYSETW, MTAGLG, NPRCIDS, MYPROC, NPROC, NPRTRW, &
      &                               NPRTRV
    USE PE2SET_MOD,             ONLY: PE2SET
    USE MPL_DATA_MODULE,        ONLY: MPL_COMM_OML
    USE OML_MOD,                ONLY: OML_MY_THREAD
#if ECTRANS_HAVE_MPI
    USE MPI_F08,                ONLY: MPI_COMM, MPI_REQUEST, MPI_FLOAT, MPI_DOUBLE
    ! Missing: MPI_ISEND, MPI_IRECV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157)
#endif
    USE TPM_STATS,              ONLY: GSTATS => GSTATS_NVTX
    USE TPM_TRANS,              ONLY: NPROMA
    USE ISO_C_BINDING,          ONLY: C_SIZE_T, C_FLOAT, C_DOUBLE, C_INT8_T
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
    USE OPENACC_EXT,            ONLY: EXT_ACC_ARR_DESC, EXT_ACC_PASS, EXT_ACC_CREATE, &
      &                               EXT_ACC_DELETE
    USE OPENACC,                ONLY: ACC_HANDLE_KIND
    USE ABORT_TRANS_MOD,        ONLY: ABORT_TRANS

    IMPLICIT NONE

    REAL(KIND=JPRBT),INTENT(OUT), POINTER :: PREEL_REAL(:)
    INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G
    INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KPTRGP(:)
    INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:), KVSETSC(:), KVSETSC3A(:), KVSETSC3B(:), KVSETSC2(:)
    REAL(KIND=JPRB),OPTIONAL,INTENT(IN) :: PGP(:,:,:), PGPUV(:,:,:,:), PGP3A(:,:,:,:), PGP3B(:,:,:,:), PGP2(:,:,:)

    TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
    TYPE(TRGTOL_HANDLE), INTENT(IN) :: HTRGTOL

    ! LOCAL VARIABLES

    !     LOCAL INTEGER SCALARS
    REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:)

    INTEGER(KIND=JPIM) :: ISENDTOT (NPROC)
    INTEGER(KIND=JPIM) :: IRECVTOT (NPROC)
    INTEGER(KIND=JPIM) :: IREQ     (NPROC*2)
    INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC)
    INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC)

    INTEGER(KIND=JPIM) :: IFIRSTLAT, IGL, IGLL, ILAST,&
                 &ILASTLAT, ILEN, JROC, IPOS, ISETA, &
                 &ISETB, IRECV, &
                 &ISETV, ISEND, JBLK, JFLD, &
                 &JGL, JI, JK, JL, ISETW,  IFLD, &
                 &II,IBUFLENR,IRECV_COUNTS, IPROC,IFLDS, &
                 &ISEND_COUNTS,INS,INR,IR, JKL, PBOUND, IERROR, ILOCAL_LAT
    INTEGER(KIND=JPIM) :: KF, KGL, KI, J3

    INTEGER(KIND=JPIM) :: IOFF, ILAT_STRIP
    INTEGER(KIND=JPIM) :: IRECV_BUFR_TO_OUT(D%NLENGTF,2),IRECV_BUFR_TO_OUT_OFFSET(NPROC), IRECV_BUFR_TO_OUT_V
    INTEGER(KIND=JPIM) :: ISEND_FIELD_COUNT(NPRTRV),ISEND_FIELD_COUNT_V
    INTEGER(KIND=JPIM) :: ISEND_WSET_SIZE(NPRTRW),ISEND_WSET_SIZE_V
    INTEGER(KIND=JPIM) :: ISEND_WSET_OFFSET(NPRTRW+1), ISEND_WSET_OFFSET_V
    INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:)
    INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V
    INTEGER(KIND=JPIM) :: IFLDA(KF_GP)
    INTEGER(KIND=JPIM) :: IVSET(KF_GP)

    REAL(KIND=JPHOOK) :: ZHOOK_HANDLE

    INTEGER(JPIM), PARAMETER :: PGP_INDICES_UV = 1
    INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP2 = 2
    INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP3A = 3
    INTEGER(JPIM), PARAMETER :: PGP_INDICES_GP3B = 4
    INTEGER(JPIM), PARAMETER :: PGP_INDICES_END = 5
    INTEGER(JPIM) :: PGP_INDICES(PGP_INDICES_END)

    TYPE(EXT_ACC_ARR_DESC) :: ACC_POINTERS(5) ! at most 5 copyins...
    INTEGER(KIND=JPIM) :: ACC_POINTERS_CNT = 0

#if ECTRANS_HAVE_MPI
    TYPE(MPI_COMM) :: LOCAL_COMM
    TYPE(MPI_REQUEST) :: IREQUEST(2*NPROC)
#endif

#ifdef PARKINDTRANS_SINGLE
#define TRGTOL_DTYPE MPI_FLOAT
#else
#define TRGTOL_DTYPE MPI_DOUBLE
#endif

#if ECTRANS_HAVE_MPI
    LOCAL_COMM%MPI_VAL = MPL_COMM_OML( OML_MY_THREAD() )
#endif

    !     ------------------------------------------------------------------

    !*       0.    Some initializations
    !              --------------------
    ! Note we have either
    ! - KVSETUV and KVSETSC (with PGP, which has u, v, and scalar fields), or
    ! - KVSETUV, KVSETSC2, KVSETSC3A KVSETSC3B (with PGPUV, GP3A, PGP3B and PGP2)
    ! KVSETs are optionals. Their sizes canalso be inferred from KV_UV_G/KV_SCALARS_G (which
    ! should match PSPXXX and PGPXXX arrays)
    IOFF=0
    IF(PRESENT(KVSETUV)) THEN
      IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:)
      IOFF=IOFF+KF_UV_G
      IVSET(IOFF+1:IOFF+KF_UV_G) = KVSETUV(:)
      IOFF=IOFF+KF_UV_G
    ELSE
      IVSET(IOFF+1:IOFF+KF_UV_G) = -1
      IOFF=IOFF+KF_UV_G
      IVSET(IOFF+1:IOFF+KF_UV_G) = -1
      IOFF=IOFF+KF_UV_G
    ENDIF
    IF(PRESENT(KVSETSC)) THEN
      IVSET(IOFF+1:IOFF+KF_SCALARS_G) = KVSETSC(:)
      IOFF=IOFF+KF_SCALARS_G
    ELSE
      IF(PRESENT(KVSETSC2)) THEN
        IVSET(IOFF+1:IOFF+SIZE(KVSETSC2)) = KVSETSC2(:)
        IOFF=IOFF+SIZE(KVSETSC2)
      ENDIF
      IF(PRESENT(KVSETSC3A)) THEN
        DO J3=1,SIZE(PGP3A,3)
          IVSET(IOFF+1:IOFF+SIZE(KVSETSC3A))=KVSETSC3A(:)
          IOFF=IOFF+SIZE(KVSETSC3A)
        ENDDO
      ENDIF
      IF(PRESENT(KVSETSC3B)) THEN
        DO J3=1,SIZE(PGP3B,3)
          IVSET(IOFF+1:IOFF+SIZE(KVSETSC3B))=KVSETSC3B(:)
          IOFF=IOFF+SIZE(KVSETSC3B)
        ENDDO
      ENDIF
    ENDIF

    IF (IOFF /= 2*KF_UV_G+KF_SCALARS_G) THEN
      CALL ABORT_TRANS("TRGTOL: ERROR in IVSET computation")
    ENDIF

    IF (LHOOK) CALL DR_HOOK('TRGTOL',0,ZHOOK_HANDLE)

    CALL GSTATS(1805,0)
    IOFF=1
    PGP_INDICES(PGP_INDICES_UV) = IOFF
    IF (PRESENT(PGPUV)) IOFF=IOFF+UBOUND(PGPUV,2)*2
    PGP_INDICES(PGP_INDICES_GP2) = IOFF
    IF (PRESENT(PGP2)) IOFF=IOFF+UBOUND(PGP2,2)
    PGP_INDICES(PGP_INDICES_GP3A) = IOFF
    IF (PRESENT(PGP3A)) IOFF=IOFF+UBOUND(PGP3A,2)*UBOUND(PGP3A,3)
    PGP_INDICES(PGP_INDICES_GP3B) = IOFF
    IF (PRESENT(PGP3B)) IOFF=IOFF+UBOUND(PGP3B,2)*UBOUND(PGP3B,3)
    PGP_INDICES(PGP_INDICES_END) = IOFF

    ! Prepare sender arrays
    ! find number of fields on a certain V-set
    IF(NPRTRV == 1) THEN
      ! This is needed because IVSET(JFLD) == -1 if there is only one V-set
      ISEND_FIELD_COUNT(1) = KF_GP
    ELSE
      ISEND_FIELD_COUNT(:) = 0
      DO JFLD=1,KF_GP
        ISEND_FIELD_COUNT(IVSET(JFLD)) = ISEND_FIELD_COUNT(IVSET(JFLD)) + 1
      ENDDO
    ENDIF
    ! find number of grid-points on a certain W-set that overlap with myself
    ISEND_WSET_SIZE(:) = 0
    DO ILOCAL_LAT=D%NFRSTLAT(MY_REGION_NS),D%NLSTLAT(MY_REGION_NS)
      ILAT_STRIP = ILOCAL_LAT-D%NFRSTLAT(MY_REGION_NS)+D%NPTRFLOFF+1
      ISEND_WSET_SIZE(D%NPROCL(ILOCAL_LAT)) = &
          & ISEND_WSET_SIZE(D%NPROCL(ILOCAL_LAT))+D%NONL(ILAT_STRIP,MY_REGION_EW)
    ENDDO
    ! sum up offsets
    ISEND_WSET_OFFSET(1) = 0
    DO JROC=1,NPRTRW
      ISEND_WSET_OFFSET(JROC+1)=ISEND_WSET_OFFSET(JROC)+ISEND_WSET_SIZE(JROC)
    ENDDO
    DO JROC=1,NPROC
      CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV)
      ! total send size is # points per field * # fields
      ISENDTOT(JROC) = ISEND_WSET_SIZE(ISETW)*ISEND_FIELD_COUNT(ISETV)
    ENDDO

    ! Prepare receiver arrays
    IRECV_BUFR_TO_OUT_OFFSET(:) = 0
    DO JROC=1,NPROC
      ! Get new offset to my current KINDEX entry
      IF (JROC > 1 .AND. KF_FS > 0) THEN
        IRECV_BUFR_TO_OUT_OFFSET(JROC) = IRECV_BUFR_TO_OUT_OFFSET(JROC-1)+IRECVTOT(JROC-1)/KF_FS
      ELSEIF (JROC > 1) THEN
        IRECV_BUFR_TO_OUT_OFFSET(JROC) = IRECV_BUFR_TO_OUT_OFFSET(JROC-1)
      ENDIF

      CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV)

      ! MAX(Index of first fourier latitude for this W set, first latitude of a senders A set)
      ! i.e. we find the overlap between what we have on sender side (others A set) and the receiver
      ! (me, the W-set). Ideally those conincide, at least mostly.
      IFIRSTLAT = MAX(D%NPTRLS(MYSETW),D%NFRSTLAT(ISETA))
      ! MIN(Index of last fourier latitude for this W set, last latitude of a senders A set)
      ILASTLAT  = MIN(D%NPTRLS(MYSETW)+D%NULTPP(MYSETW)-1,D%NLSTLAT(ISETA))

      IPOS = 0
      DO JGL=IFIRSTLAT,ILASTLAT
        ! get from "actual" latitude to the latitude strip offset
        IGL  = JGL-D%NFRSTLAT(ISETA)+D%NPTRFRSTLAT(ISETA)
        ! get from "actual" latitude to the latitude offset
        IGLL = JGL-D%NPTRLS(MYSETW)+1
        DO JL=1,D%NONL(IGL,ISETB)
          IPOS = IPOS+1
          ! offset to first layer of this gridpoint
          IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_OFFSET(JROC)+IPOS,1) = &
              & KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1)
          ! distance between two layers of this gridpoint
          IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_OFFSET(JROC)+IPOS,2) = &
              & D%NSTAGTF(IGLL+1)-D%NSTAGTF(IGLL)
        ENDDO
      ENDDO
      !we always receive the full fourier space
      IRECVTOT(JROC) = IPOS*KF_FS
    ENDDO

    block
    CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),&
        & int(KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1))+1,kind=c_size_t), int(KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1)),kind=c_size_t))
    !!CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL), size1, size2) 
    end block

#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT) PRESENT(PREEL_REAL) IF (KF_FS > 0) ASYNC(1)
    !$ACC DATA COPYIN(PGP_INDICES) ASYNC(1)
#endif

    CALL GSTATS(1805,1)

    ! Put data on device for copyin
    IF (LSYNC_TRANS) THEN
#ifdef ACCGPU
      !$ACC WAIT(1)
#endif
      CALL GSTATS(430,0)
      CALL MPL_BARRIER(CDSTRING='')
      CALL GSTATS(430,1)
    ENDIF
    CALL GSTATS(412,0)
    ACC_POINTERS_CNT = 0
    IF (PRESENT(PGP)) THEN
      ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1
      ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP)
    ENDIF
    IF (PRESENT(PGPUV)) THEN
      ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1
      ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGPUV)
    ENDIF
    IF (PRESENT(PGP2)) THEN
      ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1
      ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP2)
    ENDIF
    IF (PRESENT(PGP3A)) THEN
      ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1
      ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3A)
    ENDIF
    IF (PRESENT(PGP3B)) THEN
      ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1
      ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B)
    ENDIF
    IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND)
    !$ACC WAIT(1)
    IF (PRESENT(PGP)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
        !$ACC UPDATE DEVICE(PGP)
#endif
    ENDIF
    IF (PRESENT(PGPUV)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
        !$ACC UPDATE DEVICE(PGPUV)
#endif
    ENDIF
    IF (PRESENT(PGP2)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
        !$ACC UPDATE DEVICE(PGP2)
#endif
    ENDIF
    IF (PRESENT(PGP3A)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
        !$ACC UPDATE DEVICE(PGP3A)
#endif
    ENDIF
    IF (PRESENT(PGP3B)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
        !$ACC UPDATE DEVICE(PGP3B)
#endif
    ENDIF
#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC DATA IF(PRESENT(PGP) .AND. KF_GP > 0)   PRESENT(PGP) ASYNC(1)
    !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1)
    !$ACC DATA IF(PRESENT(PGP2))  PRESENT(PGP2) ASYNC(1)
    !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) ASYNC(1)
    !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) ASYNC(1)
#endif
    IF (LSYNC_TRANS) THEN
#ifdef ACCGPU
      !$ACC WAIT(1)
#endif
      CALL GSTATS(432,0)
      CALL MPL_BARRIER(CDSTRING='')
      CALL GSTATS(432,1)
    ENDIF
    CALL GSTATS(412,1)

    ! Figure out processes that send or recv something
    ISEND_COUNTS   = 0
    IRECV_COUNTS   = 0
    DO JROC=1,NPROC
      IF( JROC /= MYPROC) THEN
        IF(IRECVTOT(JROC) > 0) THEN
          ! I have to recv something, so let me store that
          IRECV_COUNTS = IRECV_COUNTS + 1
          IRECV_TO_PROC(IRECV_COUNTS)=JROC
        ENDIF
        IF(ISENDTOT(JROC) > 0) THEN
          ! I have to send something, so let me store that
          ISEND_COUNTS = ISEND_COUNTS+1
          ISEND_TO_PROC(ISEND_COUNTS)=JROC
        ENDIF
      ENDIF
    ENDDO

    ALLOCATE(ICOMBUFS_OFFSET(ISEND_COUNTS+1))
    ICOMBUFS_OFFSET(1) = 0
    DO JROC=1,ISEND_COUNTS
      ICOMBUFS_OFFSET(JROC+1) = ICOMBUFS_OFFSET(JROC) + ISENDTOT(ISEND_TO_PROC(JROC))
    ENDDO
    ALLOCATE(ICOMBUFR_OFFSET(IRECV_COUNTS+1))
    ICOMBUFR_OFFSET(1) = 0
    DO JROC=1,IRECV_COUNTS
      ICOMBUFR_OFFSET(JROC+1) = ICOMBUFR_OFFSET(JROC) + IRECVTOT(IRECV_TO_PROC(JROC))
    ENDDO

    IF (ISEND_COUNTS > 0) THEN
      CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFS),&
          & 1_C_SIZE_T, int(ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1)),kind=c_size_t))
    ENDIF

    !....Pack loop.........................................................
#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC DATA IF(ISEND_COUNTS > 0) PRESENT(ZCOMBUFS) ASYNC(1)
#endif

    CALL GSTATS(1602,0)
    DO INS=1,ISEND_COUNTS
      ISEND=ISEND_TO_PROC(INS)
      CALL PE2SET(ISEND,ISETA,ISETB,ISETW,ISETV)

      ISEND_FIELD_COUNT_V = ISEND_FIELD_COUNT(ISETV)
      ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS)

      IFLDS = 0
      DO JFLD=1,KF_GP
        IF(IVSET(JFLD) == ISETV .OR. IVSET(JFLD) == -1 ) THEN
          IFLDS = IFLDS+1
          IF(PRESENT(KPTRGP)) THEN
            IFLDA(IFLDS)=KPTRGP(JFLD)
          ELSE
            IFLDA(IFLDS)=JFLD
          ENDIF
        ENDIF
      ENDDO

#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC DATA COPYIN(IFLDA(1:ISEND_FIELD_COUNT_V)) ASYNC(1)
#endif

      ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(ISETW)
      ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(ISETW)
      IF(PRESENT(PGP)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
        !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) &
        !$ACC&              FIRSTPRIVATE(ISEND_FIELD_COUNT_V,ISEND_WSET_SIZE_V,ISEND_WSET_OFFSET_V,&
        !$ACC&              ICOMBUFS_OFFSET_V,NPROMA) ASYNC(1)
#endif
        DO JFLD=1,ISEND_FIELD_COUNT_V
          DO JL=1,ISEND_WSET_SIZE_V
            JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1
            JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1
            IFLD = IFLDA(JFLD)
            JI = (JFLD-1)*ISEND_WSET_SIZE_V+JL
            ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGP(JK,IFLD,JBLK)
          ENDDO
        ENDDO
      ELSE
#ifdef OMPGPU
#endif
#ifdef ACCGPU
        !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI,IOFF,PBOUND) &
        !$ACC&              FIRSTPRIVATE(ISEND_FIELD_COUNT_V,ISEND_WSET_SIZE_V,ISEND_WSET_OFFSET_V,&
        !$ACC&              ICOMBUFS_OFFSET_V,NPROMA) ASYNC(1)
#endif
        DO JFLD=1,ISEND_FIELD_COUNT_V
          DO JL=1,ISEND_WSET_SIZE_V
            JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1
            JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1
            IFLD = IFLDA(JFLD)
            JI = ICOMBUFS_OFFSET_V+(JFLD-1)*ISEND_WSET_SIZE_V+JL
            IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN
              IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV)
              PBOUND=UBOUND(PGPUV,2)
              ! TODO we could certainly reshape PGPXX arrays and we would simplify this
              ZCOMBUFS(JI) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK)
            ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN
              IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2)
              ZCOMBUFS(JI)  = PGP2(JK,IOFF+1,JBLK)
            ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN
              IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A)
              PBOUND=UBOUND(PGP3A,2)
              ZCOMBUFS(JI) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK)
            ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN
              IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B)
              PBOUND=UBOUND(PGP3B,2)
              ZCOMBUFS(JI)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK)
            ENDIF
         ENDDO
        ENDDO
      ENDIF
#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC END DATA
#endif
    ENDDO
#ifdef ACCGPU
    !$ACC WAIT(1)
#endif
    CALL GSTATS(1602,1)

    IF (LSYNC_TRANS) THEN
      CALL GSTATS(430,0)
      CALL MPL_BARRIER(CDSTRING='')
      CALL GSTATS(430,1)
    ENDIF

    CALL GSTATS(411,0)
    IF (IRECV_COUNTS > 0) THEN
      CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),&
          & 1_C_SIZE_T, int(ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1)),kind=c_size_t))
    ENDIF
#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC DATA IF(IRECV_COUNTS > 0) PRESENT(ZCOMBUFR) ASYNC(1)
#endif

    IR=0

#ifdef USE_GPU_AWARE_MPI
#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC HOST_DATA USE_DEVICE(ZCOMBUFR,ZCOMBUFS)
#endif
#else
    !! this is safe-but-slow fallback for running without GPU-aware MPI
    !$ACC UPDATE HOST(ZCOMBUFS) IF(ISEND_COUNTS > 0)
#endif
    !  Receive loop.........................................................
    DO INR=1,IRECV_COUNTS
      IR=IR+1
      IPROC=IRECV_TO_PROC(INR)
#if ECTRANS_HAVE_MPI
      CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)),IRECVTOT(IPROC), &
        & TRGTOL_DTYPE,NPRCIDS(IPROC)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR)
      IREQ(IR) = IREQUEST(IR)%MPI_VAL
#else
      CALL ABORT_TRANS("Should not be here: MPI is disabled")
#endif
    ENDDO

    !....Send loop.........................................................
    DO INS=1,ISEND_COUNTS
      IR=IR+1
      ISEND=ISEND_TO_PROC(INS)
#if ECTRANS_HAVE_MPI
      CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), &
       & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR)
      IREQ(IR) = IREQUEST(IR)%MPI_VAL
#else
      CALL ABORT_TRANS("Should not be here: MPI is disabled")
#endif
    ENDDO

    ! Copy local contribution
    IF(ISENDTOT(MYPROC) > 0 )THEN
      ! I have to send something to myself...

      ! Input is KF_GP fields. We find the resulting KF_FS fields.
      IFLDS = 0
      DO JFLD=1,KF_GP
        IF(IVSET(JFLD) == MYSETV .OR. IVSET(JFLD) == -1) THEN
          IFLDS = IFLDS+1
          IF(PRESENT(KPTRGP)) THEN
            IFLDA(IFLDS) = KPTRGP(JFLD)
          ELSE
            IFLDA(IFLDS) = JFLD
          ENDIF
        ENDIF
      ENDDO

#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC DATA COPYIN(IFLDA(1:IFLDS)) ASYNC(1)
#endif

      ISEND_WSET_OFFSET_V = ISEND_WSET_OFFSET(MYSETW)
      ISEND_WSET_SIZE_V = ISEND_WSET_SIZE(MYSETW)
      IRECV_BUFR_TO_OUT_V = IRECV_BUFR_TO_OUT_OFFSET(MYPROC)
      CALL GSTATS(1601,0)
      IF(PRESENT(PGP)) THEN
#ifdef OMPGPU
#endif
#ifdef ACCGPU
        !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF) &
        !$ACC&              FIRSTPRIVATE(KF_FS,ISEND_WSET_SIZE_V,ISEND_WSET_OFFSET_V,&
        !$ACC&              IRECV_BUFR_TO_OUT_V,NPROMA) ASYNC(1)
#endif
        DO JFLD=1,KF_FS
          DO JL=1,ISEND_WSET_SIZE_V
            JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1
            JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1
            IFLD = IFLDA(JFLD)
            IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ &
                & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1
            PREEL_REAL(IPOS) = PGP(JK,IFLD,JBLK)
          ENDDO
        ENDDO
      ELSE
#ifdef OMPGPU
#endif
#ifdef ACCGPU
        !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS,IOFF,PBOUND) &
        !$ACC&              FIRSTPRIVATE(KF_FS,ISEND_WSET_SIZE_V,ISEND_WSET_OFFSET_V, &
        !$ACC&              IRECV_BUFR_TO_OUT_V,NPROMA) ASYNC(1)
#endif
        DO JFLD=1,KF_FS
          DO JL=1,ISEND_WSET_SIZE_V
            JK = MOD(ISEND_WSET_OFFSET_V+JL-1,NPROMA)+1
            JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1
            IFLD = IFLDA(JFLD)
            IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ &
                & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1
            IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN
              IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV)
              PBOUND=UBOUND(PGPUV,2)
              PREEL_REAL(IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK)
            ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN
              IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2)
              PREEL_REAL(IPOS) = PGP2(JK,IOFF+1,JBLK)
            ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN
              IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A)
              PBOUND=UBOUND(PGP3A,2)
              PREEL_REAL(IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK)
            ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN
              IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B)
              PBOUND=UBOUND(PGP3B,2)
              PREEL_REAL(IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK)
            ENDIF
          ENDDO
        ENDDO
      ENDIF
      CALL GSTATS(1601,1)

#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC END DATA
#endif

    ENDIF


    IF(IR > 0) THEN
      CALL MPL_WAIT(KREQUEST=IREQ(1:IR), &
        & CDSTRING='TRGTOL: WAIT FOR SENDS AND RECEIVES')
    ENDIF
#ifdef USE_GPU_AWARE_MPI
#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC END HOST_DATA
#endif
#else
    !! this is safe-but-slow fallback for running without GPU-aware MPI
    !$ACC UPDATE DEVICE(ZCOMBUFR) IF(IRECV_COUNTS > 0)
#endif
    IF (LSYNC_TRANS) THEN
      CALL GSTATS(431,0)
      CALL MPL_BARRIER(CDSTRING='')
      CALL GSTATS(431,1)
    ENDIF
    CALL GSTATS(411,1)

    !  Unpack loop.........................................................

    CALL GSTATS(1603,0)
    DO INR=1,IRECV_COUNTS
      IPROC=IRECV_TO_PROC(INR)
      ILEN = IRECVTOT(IPROC)/KF_FS
      IRECV_BUFR_TO_OUT_V = IRECV_BUFR_TO_OUT_OFFSET(IPROC)
      ICOMBUFR_OFFSET_V = ICOMBUFR_OFFSET(INR)
#ifdef OMPGPU
#endif
#ifdef ACCGPU
      !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(II,IPOS) FIRSTPRIVATE(KF_FS,ILEN, &
      !$ACC&              IRECV_BUFR_TO_OUT_V,ICOMBUFR_OFFSET_V) ASYNC(1)
#endif
      DO JFLD=1,KF_FS
        DO JL=1,ILEN
          IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ &
              & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1
          PREEL_REAL(IPOS) = ZCOMBUFR(ICOMBUFR_OFFSET_V+JL+(JFLD-1)*ILEN)
        ENDDO
      ENDDO
    ENDDO
#ifdef ACCGPU
    !$ACC WAIT(1)
#endif
    CALL GSTATS(1603,1)

#ifdef OMPGPU
#endif
#ifdef ACCGPU
    !$ACC END DATA ! ZCOMBUFR
    !$ACC END DATA ! IRECV_BUFR_TO_OUT
    !$ACC END DATA ! PGPINDICES
    !$ACC END DATA !ZCOMBUFS (present)
    !$ACC END DATA !PGP3B
    !$ACC END DATA !PGP3A
    !$ACC END DATA !PGP2
    !$ACC END DATA !PGPUV
    !$ACC END DATA !PGP
#endif
    IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND)

    IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE)
  END SUBROUTINE TRGTOL
END MODULE TRGTOL_MOD
